# Import required packages
import os
import warnings
import numpy as np
import datetime as dt
import openpyxl
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from scipy import stats
import plotly.graph_objects as go # pip install plotly, conda install -c plotly plotly=4.8.1
import plotly.io as pio
import plotly.express as px
import plotly.offline as py
from plotly.offline import init_notebook_mode, plot_mpl
from fbprophet.plot import plot_plotly # pip install fbprophet
from fbprophet import Prophet
import pandas as pd
py.init_notebook_mode()
pd.options.plotting.backend = 'plotly' # change default pandas matplotlib to plotly
warnings.filterwarnings('ignore')
#pretty cell outputs: runs all codes in each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Import csv file
rawdf = pd.read_csv('./data/01_rawdata.csv')
rawdf.shape
rawdf.head()
# rawdf.info()
# check for null values
# rawdf.isna().sum()
# rawdf = rawdf.dropna() # to get rid of null values
# rawdf.shape
# # get rid of empty values
# rawdf.dropna(axis=1,how='all',inplace=True)
# rawdf.dropna(axis=0,how='all',inplace=True)
# rawdf.shape
# convert to datetime format
rawdf['date'] = pd.to_datetime(rawdf['date'])
# convert date as string and split date and time and keep only date
# rawdf.date = rawdf.date.apply(str).str.split(' ').str[0]
# stacking all hours into date time stamp making univariate dataset
rawdf = pd.melt(
rawdf,
id_vars=['date', 'zone_id'],
value_vars=['h_0', 'h_1', 'h_2', 'h_3', 'h_4', 'h_5', 'h_6',
'h_7', 'h_8', 'h_9', 'h_10', 'h_11', 'h_12', 'h_13', 'h_14', 'h_15',
'h_16', 'h_17', 'h_18', 'h_19', 'h_20', 'h_21', 'h_22', 'h_23'],
var_name='hour',
value_name='load_value')
# remove h_ from hour values
rawdf.hour = rawdf.hour.str.strip('h_')
rawdf.head()
# convert date as string and split date and time and keep only date
# rawdf['ds'] = rawdf.ds.apply(str).str.split(' ').str[0]
rawdf['date'] = pd.to_datetime(
rawdf.date, errors='coerce') + rawdf.hour.astype('timedelta64[h]')
rawdf.date.head()
rawdf = rawdf.drop(columns=['hour'])
rawdf.zone_id = rawdf.zone_id.astype('category') # to reduce size of dataframe
rawdf.head()
rawdf.info()
# keep zones as columns
rawdf = rawdf.pivot_table(
values='load_value',
index='date',
columns='zone_id',
dropna=False
)
# list comprehension method to add prefix zone_ in columns
rawdf.columns = ['zone_' + str(col) for col in rawdf.columns]
rawdf.head(2).append(rawdf.tail(2))
rawdf.shape
# Set notebook mode to work in offline
py.init_notebook_mode()
fig=rawdf.iloc[:,0:1].plot(template="ggplot2") # change pandas backend to plotly to replace matplotlib
fig = fig.update_layout(
yaxis_title="<b>Load value (in kW)</b>",
title="<b>Energy load demand </b>",
title_x=0.5,
xaxis_title= "<b>Date</b>")
fig
# take daily data
rawdf_day = rawdf.resample('D').sum()
rawdf_day.head(2).append(rawdf_day.tail(2))
rawdf_day.shape
rawdf_day.info()
The plot shows seasonality.
# Lets use only 5 zones
# data = rawdf.iloc[:, 0:5] # hourly data
data = rawdf_day.iloc[:, 0:5] # daily data
data.head(2).append(data.tail(2))
data.shape
# Set notebook mode to work in offline
py.init_notebook_mode()
# create output directory
if not os.path.exists("images"):
os.mkdir("images")
# define a function to plot graph
def plot_tsplot(col):
fig = go.Figure()
fig = fig.add_trace(go.Scatter(
x=data.index, y=data[col], mode='lines')) # mode='lines+markers'
fig = fig.update_layout(
width = 1000, height = 600,
title=dict(text="<b>Timeseries plot for daily demand for energy for "+col+"</b>",
y=0.9, x=0.5, font_size=18),
# hovermode='x',
xaxis=dict(
title="<b>Date</b>", linecolor='black', linewidth=1,
rangeslider_visible=True,
rangeselector=dict(
buttons=list(
[dict(count=7, label="1w", step="day", stepmode="backward"),
dict(count=1, label="1m", step="month", stepmode="backward"),
dict(count=6, label="6m", step="month", stepmode="backward"),
dict(count=1, label="1y", step="year", stepmode="backward"),
dict(label='full', step="all")
])
),
ticks="outside"
),
yaxis=dict(title="<b>Load demand (in kW)</b>",
linecolor='black', linewidth=1, ticks="outside"),
font=dict(family="Helvetica", size=12, color="black"),
paper_bgcolor="white",
template="plotly",
# width=600, height=400
# legend=dict(
# title="Type", orientation="h", x=0.25, y=-0.6,
# bgcolor="blue", bordercolor="black", borderwidth=1
# )
)
fig.write_html("images/"+col+"_EDA_tstplot"".html") # write_image for other formats
fig.show()
# display for only 1 plot, change as required
for col in data.columns[0:1]:
plot_tsplot(col)
# plot method 2: with backend plotly and using ggplot2 theme
# for colnum in range(0, 5):
# data.iloc[:, colnum].plot(template="ggplot2").show()
# fig.update_layout(
# yaxis_title="<b>Load value (in kW)</b>",
# title="<b>Energy load demand </b>",
# title_x=0.5,
# xaxis_title= "<b>Date</b>")
# Export as csv file with 5 zones
data.to_csv("./data/02_clean_data.csv", index=True)
# TO RUN LOOP FOR ALL DATAFRAME and columns, DO STH LIKE THIS
# def eachzone(col):
# # ALL YOUR ACTIONS
# print(data.columns)
# for col in data:
# eachzone(col)
# Import holidays csv file
holidays = pd.read_csv('./data/holidays.csv')
holidays.ds=pd.to_datetime(holidays.ds)
# data.columns
# ['zone_1', 'zone_2', 'zone_3', 'zone_4', 'zone_5']
col = 'zone_1'
df = data[[col]].reset_index() # select each zone and reset index
# Prophet requires date = ds, feature = y
df.columns = ['ds', 'y']
df.head(2).append(df.tail(2))
First 80% of data are taken as training and remaining 20% as test dataset
# # split data into training and validation set
training_size = int(len(df)*0.8)
print(training_size)
train_df, test_df = df[: training_size], df[training_size:]
print(len(train_df)*100/(len(train_df)+len(test_df)))
Prophet will by default fit weekly and yearly seasonalities if the time series is more than two cycles long. It will also fit daily seasonality for a sub-daily time series. You can add other seasonalities (monthly, quarterly, hourly)if required.
# # # fit a prophet model on training dataset
# model = Prophet()
# model.fit(train_df)
# check EDA for monthly and quarterly trend or other and add seasonality based on that
# Do not run loop randomly for optimizing model
# HOliday --do not use for hourly data, holidays_prior_scale = 20-40, try big if holidays have high impact, default=10
# growth = linear default, if curve then logistic,
# fourier: try 10-25 (higher normally gives better result)
# period = 30.5 --> what happened at a certain point is likely to happen again in 35 days.
# seasonality_prior_scale: 10-25
# n_changepoints: sudden/abrupt change in trend, let prophet find itself and then add..
model = Prophet(
growth='linear',
# holidays=holidays,
seasonality_mode='additive',
# n_changepoints = 100, # default = 25
changepoint_prior_scale=0.05, # default = 0.05
# seasonality_prior_scale=15,
# holidays_prior_scale=10, # default = 10
daily_seasonality=False,
weekly_seasonality=False,
yearly_seasonality=False
# ).add_seasonality(name="monthly", period=30.5, fourier_order=6
# ).add_seasonality(name="daily", period=1,fourier_order=15
).add_seasonality(name="weekly", period=7,fourier_order=3
# ).add_seasonality(name="6month", period=30.5*6,fourier_order=4
).add_seasonality(name="yearly", period=365.25,fourier_order=10
# ).add_seasonality(name="quarterly", period=365.25/4, fourier_order=5, prior_scale=15
)
model.fit(train_df)
Create a dataframe with ds(datetime stamp) containing the dates for which we want to make the predictions.
# Add dataframe including test size, By default it includes dates from the history
# specify frequency as H if hourly and D if daily data
future = model.make_future_dataframe(periods=len(test_df), freq='D') # periods=300, freq='H', 'month'
# Predict for train/test dataset
forecast = model.predict(future)
# forecast1.tail().T # transpose dataframe
# forecast[(forecast.ds > "2008-6-22")].head().T
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
# define a funtion to create dataframe containing prediction and actual values
def actual_pred_dataframe(historical, forecast):
return forecast.set_index('ds')[
['yhat', 'yhat_lower', 'yhat_upper']].join(historical.set_index('ds'))
cmp_df = actual_pred_dataframe(df, forecast)
cmp_df.head(2).append(cmp_df.tail(2))
cmp_df.to_csv("./data/03_cmp_df.csv", index=True) # Export as csv file
MAPE: mean absolute percentage error
MAE: mean absolute error
# define function to calculate MAPE and MAE
def calculate_forecast_errors(dataframe, training_size):
dataframe = dataframe.copy()
dataframe['e'] = dataframe['y'] - dataframe['yhat']
dataframe['p'] = 100 * dataframe['e'] / dataframe['y']
predicted_part = dataframe[training_size:] # test data part
def error_mean(error_name):
return np.mean(abs(predicted_part[error_name])).round(2)
return {'MAPE': error_mean('p'), 'MAE': error_mean('e')}
# Print MAPE and MAE
for error_name, err_value in calculate_forecast_errors(cmp_df, training_size).items():
print(error_name, err_value)
Basic model: MAPE 11.4; MAE 187042.95
Tweaked model: MAPE 11.38; MAE 186657.72
# display forecast components Prophet.plot_components method
# trend, yearly seasonality, and weekly seasonality of the time series
# include holiday to see holidays
component_plot = model.plot_components(forecast)
component_plot.savefig("images/"+col+"_component_plot_train"".pdf") # pandas---savefig, write_html
# py.init_notebook_mode(connected=True) # connection=false,default,requires internet, file size high
# convert the plot into interactive one
# component_plot = plot_mpl(component_plot, filename="images/component_plot_train.html", auto_open=True) #opens graph in html in brower
the great recession, a global economic downturn that devastated word financial markets as well as the banking and real estate industries.py.init_notebook_mode() # make it true if you want reduce file, but you need to be online
forecast_plot = plot_plotly(model, forecast) # This returns a plotly Figure
forecast_plot.write_html("images/forecast_plot_traintest.html", auto_open=False) # turn it to true if you want to open it on own.show
forecast_plot.show()
# Set notebook mode to work in offline
py.init_notebook_mode()
fig = go.Figure()
fig = fig.add_trace(go.Scatter(
x=cmp_df.index, y=cmp_df.y, mode='markers', name='actual',line_color='rgb(0,100,80)'))
fig = fig.add_trace(go.Scatter(
x=cmp_df.index, y=cmp_df.yhat, mode='lines', name='predicted', line_color='rgb(231,107,243)'))
fig = fig.update_layout(
width=1000, height=600,
title=dict(text="<b>Prophet model train test</b>", y=0.9, x=0.5, font_size=18),
xaxis=dict(title="<b>Date</b>", linecolor='black', linewidth=1,
rangeselector=dict(
buttons=list(
[dict(count=7, label="1w", step="day",
stepmode="backward"),
dict(count=1, label="1m", step="month",
stepmode="backward"),
dict(count=6, label="6m", step="month",
stepmode="backward"),
dict(count=1, label="1y", step="year",
stepmode="backward"),
dict(label='full', step="all")
])
)),
yaxis=dict(title="<b>Load demand (in kW)</b>", linecolor='black', linewidth=1),
font=dict(family="Helvetica", size=12, color="black"),
paper_bgcolor="white"
)
fig = fig.add_shape(
# filled Rectangle
type="rect",
x0="2007-08-01", y0=0,
x1="2008-06-22", y1=3000000,
line=dict(
color="black",
width=1,
),
fillcolor="red", opacity=0.09, layer="below", line_width=0
)
fig.write_html("images/"+col+"_Train_Test_actualvspred"".html") # write_image for other formats
fig.show()
# test_forecast = forecast[training_size:]
# from math import sqrt
# from sklearn.metrics import mean_squared_error
# def mean_absolute_percentage_error(y_true, y_pred):
# return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# def errors(y_true,y_pred):
# rmse = sqrt(mean_squared_error(y_true, y_pred))
# mape = mean_absolute_percentage_error(y_true, y_pred)
# return{'MAPE':mape,
# 'RMSE':rmse}
# errors(test_df.y, test_forecast.yhat)
# # fit prophet model with similar parameters on full dataset
model2 = Prophet(
growth='linear',
seasonality_mode='additive',
changepoint_prior_scale=0.05, # default = 0.05
daily_seasonality=False,
weekly_seasonality=False,
yearly_seasonality=False
).add_seasonality(name="weekly", period=7,fourier_order=3
).add_seasonality(name="yearly", period=365.25,fourier_order=10
)
model2.fit(df)
future2 = model2.make_future_dataframe(periods=30, freq='D') # 24*7 days for hourly data
# Predict for train/test dataset
forecast2 = model2.predict(future2)
# forecast2[(forecast2.ds > "2008-6-23")]
# just the forecasted part not the whole dataset
final_forecast = forecast2.set_index('ds')
final_forecast = final_forecast[['yhat', 'yhat_lower', 'yhat_upper']].tail(30)
final_forecast.to_csv("./data/04_final_forecast.csv", index=True) # Export as csv file
cmp_df.head(2).append(cmp_df.tail(2)) # cmp_df: full dataset with actual and prediction
final_forecast.head(2).append(final_forecast.tail(2)) # final_forecast: future predicted dataset y-hat and confidence interval
# for final_forecast upper and lower limit
x3 = final_forecast.index
x3_rev = x3[::-1] # make reverse
y3 = final_forecast.yhat
y3_upper = final_forecast.yhat_upper
y3_lower = final_forecast.yhat_lower
y3_lower = y3_lower[::-1]
# Set notebook mode to work in offline
py.init_notebook_mode()
# create figure
fig = go.Figure()
# Add line 1
fig = fig.add_trace(go.Scatter(
x=cmp_df.index, y=cmp_df.y, mode='lines', name='actual', line_color='rgb(0,100,80)'))
# Add line 2
fig = fig.add_trace(go.Scatter(
x=cmp_df.index, y=cmp_df.yhat, mode='lines', name='predicted', line_color='rgb(231,107,243)'))
# Add lower and upper limit area for line 3
fig = fig.add_trace(go.Scatter(
x=x3.append(x3_rev), # are dataframe, so append
y=y3_upper.append(y3_lower),
fill='toself', fillcolor='rgba(0,176,246,0.2)', line_color='rgba(255,255,255,0)',
name='future_prediction', showlegend=False))
# Add line 3
fig = fig.add_trace(go.Scatter(
x=final_forecast.index, y=final_forecast.yhat, mode='lines', name='future_prediction', line_color='rgb(0,176,246)'))
# update layout
fig = fig.update_layout(
width=1000, height=600,
title=dict(text="<b>Prophet model train test</b>",
y=0.9, x=0.5, font_size=18),
xaxis=dict(title="<b>Date</b>", linecolor='black', linewidth=1, ticks="outside",
rangeslider_visible=True,
rangeselector=dict(
buttons=list(
[dict(count=7, label="1w", step="day",
stepmode="backward"),
dict(count=1, label="1m", step="month",
stepmode="backward"),
dict(count=6, label="6m", step="month",
stepmode="backward"),
dict(count=1, label="1y", step="year",
stepmode="backward"),
dict(label='full', step="all")
])
)),
yaxis=dict(title="<b>Load demand (in kW)</b>",
linecolor='black', linewidth=1, ticks="outside"),
font=dict(family="Helvetica", size=12, color="black"),
paper_bgcolor="white")
fig = fig.add_shape(
# filled Rectangle
type="rect",
x0="2007-08-01", y0=0,
x1="2008-06-22", y1=3000000,
line=dict(
color="black",
width=1,
),
fillcolor="red", opacity=0.09, layer="below", line_width=0
)
fig.write_html("images/"+col+"_final_graph"".html") # write_image for other formats
fig.show()
# Set notebook mode to work in offline
py.init_notebook_mode()
forecast_plot = plot_plotly(model2, forecast2) # This returns a plotly Figure
forecast_plot.write_html("images/"+col+"_final_graph_2"".html", auto_open=False)
forecast_plot.show()
# display forecast components Prophet.plot_components method
# trend, yearly seasonality, and weekly seasonality of the time series
# include holiday to see holidays
# interactive plots
py.init_notebook_mode(connected=False)
component_plot = model2.plot_components(forecast2)
component_plot.savefig("images/"+col+"_component_plot_full"".pdf") # pandas---savefig, write_html